Zrm(list=ls(all=T))
knitr::opts_chunk$set(paged.print=FALSE, comment = NA)
pacman::p_load(magrittr, readr, caTools, ggplot2, dplyr, vcd, plotly)Z = read_csv("data/ta_feng_all_months_merged.csv") %>%
data.frame %>% setNames(c(
"date","cust","age","area","cat","prod","qty","cost","price"))Rows: 817741 Columns: 9
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (5): TRANSACTION_DT, CUSTOMER_ID, AGE_GROUP, PIN_CODE, PRODUCT_ID
dbl (4): PRODUCT_SUBCLASS, AMOUNT, ASSET, SALES_PRICE
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
[1] 817741
# A tibble: 817,741 × 9
date cust age area cat prod qty cost price
<chr> <chr> <chr> <chr> <dbl> <chr> <dbl> <dbl> <dbl>
1 11/1/2000 01104905 45-49 115 110411 4710199010372 2 24 30
2 11/1/2000 00418683 45-49 115 120107 4710857472535 1 48 46
3 11/1/2000 01057331 35-39 115 100407 4710043654103 2 142 166
4 11/1/2000 01849332 45-49 Others 120108 4710126092129 1 32 38
5 11/1/2000 01981995 50-54 115 100205 4710176021445 1 14 18
6 11/1/2000 01741797 35-39 115 110122 0078895770025 1 54 75
7 11/1/2000 00308359 60-64 115 110507 4710192225520 1 85 105
8 11/1/2000 01607000 35-39 221 520503 4712936888817 1 45 68
9 11/1/2000 01057331 35-39 115 320203 4715398106864 2 70 78
10 11/1/2000 00236645 35-39 Unknown 120110 4710126091870 1 43 53
# ℹ 817,731 more rows
age.group = c("<25","25-29","30-34","35-39","40-44",
"45-49","50-54","55-59","60-64",">65")
Z$age = c(paste0("a",seq(24,69,5)),"a99")[match(Z$age,age.group,11)]
Z$area = paste0("z",Z$area)par(mfrow=c(1,2),cex=0.7)
table(Z$age, useNA='ifany') %>% barplot(main="Age Groups", las=2)
table(Z$area,useNA='ifany') %>% barplot(main="Areas", las=2) qty cost price
99% 6 858.0 1014.00
99.9% 14 2722.0 3135.82
99.95% 24 3799.3 3999.00
[1] 817182
把每一天、每一為顧客的交易項目彙總為一張訂單
Warning: The `...` argument of `group_indices()` is deprecated as of dplyr 1.0.0.
ℹ Please `group_by()` first
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.
XX = Z %>% group_by(tid) %>% summarise(
date = min(date), # 交易日期
cust = min(cust), # 顧客 ID
age = min(age), # 顧客 年齡級別
area = min(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame
nrow(X) # 119422 [1] 119422
items pieces total gross
99.9% 54 81.0000 9009.579 1824.737
99.95% 62 94.2895 10611.579 2179.817
99.99% 82 133.0000 16044.401 3226.548
tid date cust age
Min. : 1 Min. :2000-11-01 Length:119328 Length:119328
1st Qu.: 29855 1st Qu.:2000-11-29 Class :character Class :character
Median : 59705 Median :2001-01-01 Mode :character Mode :character
Mean : 59712 Mean :2000-12-31
3rd Qu.: 89581 3rd Qu.:2001-02-02
Max. :119422 Max. :2001-02-28
area items pieces total
Length:119328 Min. : 1.000 Min. : 1.000 Min. : 5.0
Class :character 1st Qu.: 2.000 1st Qu.: 3.000 1st Qu.: 227.0
Mode :character Median : 5.000 Median : 6.000 Median : 510.0
Mean : 6.802 Mean : 9.222 Mean : 851.6
3rd Qu.: 9.000 3rd Qu.:12.000 3rd Qu.: 1080.0
Max. :62.000 Max. :94.000 Max. :15345.0
gross
Min. :-1645.0
1st Qu.: 21.0
Median : 68.0
Mean : 130.9
3rd Qu.: 168.0
Max. : 3389.0
Ad0 = max(X$date) + 1
A = X %>% mutate(
days = as.integer(difftime(d0, date, units="days"))
) %>% group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = min(age), # age group
area = min(area), # area code
) %>% data.frame
nrow(A) # 32241[1] 32241
par(mfrow=c(1,2),cex=0.7)
table(A$age, useNA='ifany') %>% barplot(main="Age Groups",las=2)
table(A$area, useNA='ifany') %>% barplot(main="Areas",las=2) cust r s f
Length:32241 Min. : 1.00 Min. : 1.00 Min. : 1.000
Class :character 1st Qu.: 9.00 1st Qu.: 56.00 1st Qu.: 1.000
Mode :character Median : 26.00 Median : 92.00 Median : 2.000
Mean : 37.45 Mean : 80.78 Mean : 3.701
3rd Qu.: 60.00 3rd Qu.:110.00 3rd Qu.: 4.000
Max. :120.00 Max. :120.00 Max. :85.000
m rev raw age
Min. : 8.0 Min. : 8 Min. : -784.0 Length:32241
1st Qu.: 365.0 1st Qu.: 707 1st Qu.: 75.0 Class :character
Median : 705.7 Median : 1750 Median : 241.0 Mode :character
Mean : 993.1 Mean : 3152 Mean : 484.6
3rd Qu.: 1291.0 3rd Qu.: 3968 3rd Qu.: 612.0
Max. :12636.0 Max. :127686 Max. :20273.0
area
Length:32241
Class :character
Mode :character
par(mfrow=c(3,2), mar=c(3,3,4,2))
for(x in c('r','s','f','m'))
hist(A[,x],freq=T,main=x,xlab="",ylab="",cex.main=2)
hist(pmin(A$f,10),0:10,freq=T,xlab="",ylab="",cex.main=2)
hist(log(A$m,10),freq=T,xlab="",ylab="",cex.main=2)🌷 偏態分佈的處理方法
log(A$m, 10)pmin(A$f, 10) date cust age area cat prod qty cost price tid
0 0 0 0 0 0 0 0 0 0
tid date cust age area items pieces total gross
0 0 0 0 0 0 0 0 0
cust r s f m rev raw age area
0 0 0 0 0 0 0 0 0
確認資料沒有缺失值之後,將顧客資料存進A0,交易資料X存進X0,原始資料Z存進Z0
使用馬賽克圖檢視列連表的關聯性(Association between Categorial Variables)
MOSA = function(formula, data) mosaic(formula, data, shade=T,
margins=c(0,1,0,0), labeling_args = list(rot_labels=c(90,0,0,0)),
gp_labels=gpar(fontsize=9), legend_args=list(fontsize=9),
gp_text=gpar(fontsize=7),labeling=labeling_residuals)
MOSA(~age+area, A0)A0 %>% group_by(age) %>% summarise(
Group.Size = n(), # 族群人數
avg.Freq = mean(f), # 平均購買次數
avg.Revenue = sum(f*m)/sum(f) # 平均客單價
) %>%
ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
geom_point(aes(col=age, size=Group.Size), alpha=0.5) +
geom_text(aes(label=age)) +
scale_size(range=c(5,25)) +
theme_bw() + theme(legend.position="none") +
ggtitle("年齡區隔特徵 (泡泡大小:族群人數)") +
ylab("平均購買次數") + xlab("平均客單價")[1] 0.01941627
由於a99(沒有年齡資料的顧客)人數不多,而且特徵很獨特,探索時我們可以考慮濾掉這群顧客
A0 %>% filter(age!="a99") %>% # 濾掉沒有年齡資料的顧客('a99')
group_by(age) %>% summarise(
Group.Size = n(), # 族群人數
avg.Freq = mean(f), # 平均購買次數
avg.Revenue = sum(f*m)/sum(f) # 平均客單價
) %>%
ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
geom_point(aes(col=age, size=Group.Size), alpha=0.5) +
geom_text(aes(label=age)) +
scale_size(range=c(5,25)) +
theme_bw() + theme(legend.position="none") +
ggtitle("年齡區隔特徵 (泡泡大小:族群人數)") +
ylab("平均購買次數") + xlab("平均客單價")A0 %>% filter(age!="a99") %>% # 濾掉沒有年齡資料的顧客('a99')
group_by(area) %>% summarise(
Group.Size = n(), # 族群人數
avg.Freq = mean(f), # 平均購買次數
avg.Revenue = sum(f*m)/sum(f) # 平均客單價
) %>%
ggplot(aes(y=avg.Freq, x=avg.Revenue)) +
geom_point(aes(col=area, size=Group.Size), alpha=0.5) +
geom_text(aes(label=area)) +
scale_size(range=c(5,25)) +
theme_bw() + theme(legend.position="none") +
ggtitle("地理區隔特徵 (泡泡大小:族群人數)") +
ylab("平均購買次數") + xlab("平均客單價")※ 「年齡」與「地區」之間有很高的關聯性
§ 南港(z115)30~40歲的顧客比率比較低
§ 汐止(z221)、內湖(z114)和其他(zOthers)30~40歲的顧客比率比較高
※ 「平均購買次數」和「平均客單價」之間有明顯的負相關
§ 住的遠(近)的人比較少(常)來買、但每一次買的比較多(少)
§ 30~40歲(年輕和年長)的人比較少(常)來買、但每一次買的比較多(少)
不同年齡、地區的顧客喜歡買的品類看來也不太一樣
X0$wday = format(X0$date, "%u")
par(cex=0.7, mar=c(2,3,2,1))
table(X0$wday) %>% barplot(main="No. Transactions in Week Days")col6 = c('seagreen','gold','orange',rep('red',3))
gg= group_by(Z0, cat) %>% summarise(
solds = n(), qty = sum(qty), rev = sum(price), cost = sum(cost),
profit = rev - cost, margin = 100*profit/rev
) %>%
top_n(100, profit) %>%
ggplot(aes(x=margin, y=rev, col=profit, label=cat)) +
geom_point(size=2,alpha=0.8) + scale_y_log10() +
scale_color_gradientn(colors=col6) +
theme_bw()
ggplotly(gg)背景
我們假設這家量販店大北百貨開設在汐止與南港等地區,而較多的顧客年齡都落在30-44歲,並且喜歡在周末時造訪,考量到此消費族群人數較多與購買平均客單價最高,因此我們將此訂為我們的TA
由上面馬賽克圖得知,可發現南港30-40歲的消費客群人數較低,所以我們將行銷目標訂為「提升南港地區30-40歲顧客比率」 ,而我們推測造成這樣的現象可能有以下四點原因:
針對第一點原因本組預計透過分析「產品組合」,進一步提出的策略:
針對第二點原因我們提出以下的策略: - 因為此族群多為假日消費、開車、家庭採購,我們推論在南港地區的店家周邊可能有停車位不足的問題,所以認為可以跟附近停車場合作,藉由增加停車位解決店面不易到達問題
針對第三、四點原因,我們提出以下策略: - 與附近五公里的店家進行聯盟行銷 - 開設粉絲專業、成立社群推播優惠資訊